Statefarm Kaggle submission (fast.ai homework3)

What I'll need to do:

  • set up data structure into sample, train, valid, test
  • Import VGG16
  • pop the top layer, train it
  • set all fully connected layers to trainable
  • Improvements:
    • play with dropout parameter
    • add data augmentation
    • stack multiple versions of the classifier
    • apply batch norm
    • have a setup that adjusts the learning rate (see the scheduler sketch below)
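
For the learning-rate point, one option I may fall back on is Keras's LearningRateScheduler callback, which can be passed to fit()/fit_generator(). A minimal sketch; the halve-every-two-epochs rule is just an example, not something I've tuned:

    from keras.callbacks import LearningRateScheduler

    # Example schedule: start at 1e-3 and halve the learning rate every two epochs.
    # The exact rule here is an arbitrary placeholder.
    def lr_schedule(epoch):
        return 0.001 * (0.5 ** (epoch // 2))

    lr_callback = LearningRateScheduler(lr_schedule)
    # then pass callbacks=[lr_callback] to fit() / fit_generator()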

These are the general imports; always make sure to run these first.


In [1]:
import os
import zipfile
import shutil
import csv
import bcolz
os.environ["KERAS_BACKEND"] = "theano"
import keras
import numpy as np
from keras.utils.data_utils import get_file
from keras.models import load_model
from keras.layers.normalization import BatchNormalization
from keras.layers import Dense, Dropout, Flatten, Lambda
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.models import Sequential
from keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import Adam
from keras.utils.np_utils import to_categorical

model_url = "http://files.fast.ai/models/"
model_name = "vgg16.h5"
cache_dir = "models"


Using Theano backend.
Using gpu device 0: Tesla K80 (CNMeM is enabled with initial size: 90.0% of memory, cuDNN 5103)
/home/ubuntu/anaconda2/lib/python2.7/site-packages/theano/sandbox/cuda/__init__.py:600: UserWarning: Your cuDNN version is more recent than the one Theano officially supports. If you see any problems, try updating Theano or downgrading cuDNN to version 5.
  warnings.warn(warn)

Data structure

First we set up the data structure in the processed directory, with proper:

  • sample (drawn from ~15% of the drivers, itself split into train/valid)
  • train
  • valid (~20% of the drivers, moved out of train)
  • test (all test images under a single unknown directory)

The split is done per driver rather than per image, so the same driver never ends up in both a training and a validation set.
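
The cell below should leave data/processed looking roughly like this:

    processed/
        train/c0 ... c9          <- extracted from imgs.zip
        valid/c0 ... c9          <- moved out of train (~20% of drivers)
        sample/train/c0 ... c9   <- copied from train (~15% of drivers)
        sample/valid/c0 ... c9   <- moved out of sample/train (~40% of the sample drivers)
        test/unknown/            <- all test images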

In [ ]:
raw_path = os.path.join(os.getcwd(), os.pardir, 'data', 'raw')
processed_path = os.path.join(os.getcwd(), os.pardir, 'data', 'processed')

# Make directories sample, valid, train, test, first check if this whole step is necessary
if os.path.exists(os.path.join(processed_path, 'sample')):
    print 'Sample directory already exists, no need to do data structuring!'
else:
    os.mkdir(os.path.join(processed_path, 'sample'))
    os.mkdir(os.path.join(processed_path, 'sample', 'train'))
    os.mkdir(os.path.join(processed_path, 'sample', 'valid'))
    os.mkdir(os.path.join(processed_path, 'valid'))
    
    # Extract Kaggle zipfiles to correct path
    print 'Extracting zips, this may take a while...'
    img_zip_handle = zipfile.ZipFile(os.path.join(raw_path, 'imgs.zip'), 'r')
    img_zip_handle.extractall(processed_path)
    img_zip_handle.close()
    
    csv_zip_handle = zipfile.ZipFile(os.path.join(raw_path, 'driver_imgs_list.csv.zip'), 'r')
    csv_zip_handle.extractall(processed_path)
    csv_zip_handle.close()
    print 'Done extracting zips!'
    
    # Set up sample directory structure
    for i in range(10):
        dirname = 'c' + str(i)
        os.mkdir(os.path.join(processed_path, 'sample', 'train', dirname))
        os.mkdir(os.path.join(processed_path, 'sample', 'valid', dirname))
        os.mkdir(os.path.join(processed_path, 'valid', dirname))
        
    os.mkdir(os.path.join(processed_path, 'test', 'unknown'))
    for filename in os.listdir(os.path.join(processed_path, 'test')):
        if filename.endswith('.jpg'):
            src = os.path.join(processed_path, 'test', filename)
            dest = os.path.join(processed_path, 'test', 'unknown', filename)
            shutil.move(src, dest)
        
    data = np.genfromtxt(os.path.join(processed_path, 'driver_imgs_list.csv'), delimiter=',', dtype=None)
    data = data[1:,:]
    drivers = np.unique(data[:,0])
    num_drivers = drivers.shape[0]
    # Copy the images of ~15% of the drivers into the sample folder
    sample_drivers_amount = int(np.floor(num_drivers*0.15))
    sample_drivers = np.random.choice(drivers, sample_drivers_amount, replace=False)

    # Move the images of ~20% of the drivers into the valid folder
    validation_drivers_amount = int(np.floor(num_drivers*0.2))
    validation_drivers = np.random.choice(drivers, validation_drivers_amount, replace=False)

    # Set up sample set
    for i in range(sample_drivers_amount):
        driver_name = sample_drivers[i]
        driver_columns = data[data[:,0] == driver_name]
        for j in range(10):
            driver_class = 'c' + str(j)
            dest = os.path.join(processed_path, 'sample', 'train', driver_class)
            class_columns = driver_columns[driver_columns[:,1] == driver_class]
            for filename in class_columns[:,2]:
                src = os.path.join(processed_path, 'train', driver_class, filename)
                shutil.copyfile(src, os.path.join(dest, filename))

    # Now move ~40% of the sample drivers from sample/train to sample/valid
    sample_drivers_validation_amount = int(np.floor(sample_drivers_amount*0.4))
    sample_drivers_validation = np.random.choice(sample_drivers, 
                                                 sample_drivers_validation_amount, 
                                                 replace=False)

    for i in range(sample_drivers_validation_amount):
        driver_name = sample_drivers_validation[i]
        driver_columns = data[data[:,0] == driver_name]
        for j in range(10):
            driver_class = 'c' + str(j)
            class_columns = driver_columns[driver_columns[:,1] == driver_class]
            for filename in class_columns[:,2]:
                dest = os.path.join(processed_path, 'sample', 'valid', driver_class, filename)
                src = os.path.join(processed_path, 'sample', 'train', driver_class, filename)
                shutil.move(src, dest)

    # Set up validation set
    for i in range(validation_drivers_amount):
        driver_name = validation_drivers[i]
        driver_columns = data[data[:,0] == driver_name]

        for j in range(10):
            driver_class = 'c' + str(j)
            class_columns = driver_columns[driver_columns[:,1] == driver_class]
            for filename in class_columns[:,2]:
                src = os.path.join(processed_path, 'train', driver_class, filename)
                dest = os.path.join(processed_path, 'valid', driver_class, filename)
                shutil.move(src, dest)

VGG16() setup boilerplate


In [ ]:
def add_conv_block(model, layers, filters):
    # VGG-style block: `layers` x (zero-padding + 3x3 conv + ReLU), then a 2x2 max-pool
    for i in range(layers):
        model.add(ZeroPadding2D((1,1)))
        model.add(Convolution2D(filters, 3, 3, activation='relu'))
    model.add(MaxPooling2D((2,2), strides=(2,2)))
    return model
    
def add_fc_block(model, dropout):
    # VGG-style fully connected block: 4096-unit ReLU layer followed by dropout
    model.add(Dense(4096, activation='relu'))
    model.add(Dropout(dropout))
    return model

In [ ]:
class vgg16():
    def __init__(self, dropout=0.5):
        self.vgg_mean = np.array([123.68, 116.779, 103.939], dtype=np.float32).reshape([3,1,1])
        self.create(dropout)
        
    def create(self, dropout):
        def vgg_preprocess(x, mean):
            mean = np.array(mean)
            x = x - mean
            return x[:, ::-1]  # reverse the channel axis: RGB -> BGR (the VGG weights expect BGR)
        
        model = self.model = Sequential()
        
        model.add(Lambda(vgg_preprocess, 
                         input_shape=(3, 224, 224), 
                         output_shape=(3, 224, 224),
                         arguments = {'mean': self.vgg_mean.tolist()}
                        ))
        
        model = add_conv_block(model, 2, 64)
        model = add_conv_block(model, 2, 128)
        model = add_conv_block(model, 3, 256)
        model = add_conv_block(model, 3, 512)
        model = add_conv_block(model, 3, 512)
        
        model.add(Flatten())
        
        model = add_fc_block(model, dropout)
        model = add_fc_block(model, dropout)
        model.add(Dense(1000, activation='softmax'))
        
        model.load_weights(get_file(model_name, model_url+model_name, cache_subdir=cache_dir))

Load in data with generators

Here I set up the generators for the training and validation data.


In [ ]:
DEBUG = True
data_dir = os.path.join(os.getcwd(), os.pardir, 'data')
model_dir = os.path.join(os.getcwd(), os.pardir, 'models')
if DEBUG:
    path = os.path.join(data_dir, 'processed', 'sample')
    batch_size = 4
    epochs = 2
else:
    path = os.path.join(data_dir, 'processed')
    batch_size = 64
    epochs = 5

train_path = os.path.join(path, 'train')
val_path = os.path.join(path, 'valid')
train_batches = ImageDataGenerator().flow_from_directory(train_path, 
                                                         target_size=(224,224), 
                                                         batch_size=batch_size, 
                                                         shuffle=True)
val_batches = ImageDataGenerator().flow_from_directory(val_path, 
                                                       target_size=(224,224), 
                                                       batch_size=batch_size, 
                                                       shuffle=True)

Finetuning the model

  • Pop the top (1000-way ImageNet) layer and replace it with a 10-way softmax output, matching our one-hot encoded classes
  • Retrain the model with the new dense layer (everything else frozen), which gives a good starting point for later fine-tuning
  • Save the model, so that we can start toying with it in the next section

In [ ]:
lr = 0.001

model = vgg16(dropout=0.5).model
model.pop()
for layer in model.layers: layer.trainable=False
model.add(Dense(10, activation='softmax'))
model.compile(optimizer=Adam(lr), loss='categorical_crossentropy', metrics=['accuracy'])
model.fit_generator(train_batches, 
                    samples_per_epoch=train_batches.nb_sample, 
                    nb_epoch=epochs, 
                    validation_data=val_batches, 
                    nb_val_samples=val_batches.nb_sample)

model.save(os.path.join(model_dir, 'model_with_new_top.h5'))

New model architecture

Now that we have the trained model, we should probably make all the FC layers trainable. Additionally, we can start playing with:

  • learning rate schedule
  • batchnorm
  • data augmentation
  • setting different epochs
  • some other kind of regularisation? (see the weight-decay sketch below)
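
For the regularisation point, one candidate is L2 weight decay on the new dense layers. A minimal sketch in the Keras 1 API used here; the 128-unit layer mirrors fc_model, but input_dim and the 0.01 decay factor are placeholder values I haven't tuned:

    from keras.models import Sequential
    from keras.layers import Dense
    from keras.regularizers import l2

    # Sketch: an fc_model-style dense layer with L2 weight decay on its weights
    reg_model = Sequential()
    reg_model.add(Dense(128, activation='relu', W_regularizer=l2(0.01), input_dim=512))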

First, import the model from when we saved it. Then:

  • Separate convolutional layers from fully connected ones
  • Make a new convolutional architecture with whatever we want to implement
  • Put them together
  • Train

In [6]:
old_model = load_model(os.path.join(os.getcwd(), 
                                    os.pardir, 
                                    'models', 
                                    'model_with_new_top.h5'))

Batch normalisation

Let's implement batch normalisation first; it'll speed up the search for a good learning rate. From this link we know that BatchNormalization() needs to be applied after the activation.


In [7]:
flatten_index = [index for index,layer in enumerate(old_model.layers) if type(layer).__name__ == 'Flatten'][0]

# Keep everything between the preprocessing Lambda and the last max-pool;
# that max-pool gets added back at the start of fc_model below.
conv_model_layers = old_model.layers[1:flatten_index-1]
conv_model = Sequential(conv_model_layers)

In [4]:
def fc_model(dropout):
    model = Sequential()

    model.add(MaxPooling2D(input_shape=conv_model.layers[-1].output_shape[1:]))
    model.add(Flatten())
    
    model.add(Dense(128, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(dropout))
    
    model.add(Dense(128, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(dropout))
    
    model.add(Dense(10, activation='softmax'))
    return model

Data augmentation

Let's set up new batch generators, this time making use of data augmentation. We only augment the training input; there's no point augmenting the validation input, since no learning takes place on it. The training generators have shuffle set to False because their conv outputs will be saved to disk, so the image order needs to be reproducible (and the labels have to line up).

Some minor debug settings


In [17]:
DEBUG = False
data_dir = os.path.join(os.getcwd(), os.pardir, 'data')
model_dir = os.path.join(os.getcwd(), os.pardir, 'models')
if DEBUG:
    path = os.path.join(data_dir, 'processed', 'sample')
    batch_size = 4
    epochs = 2
else:
    path = os.path.join(data_dir, 'processed')
    batch_size = 64
    epochs = 5
# Define test_path only after path has been set, so it points at the right directory
test_path = os.path.join(path, 'test')

Conv stack output

Using only the convolutional part of VGG16, I generate feature outputs for the augmented and non-augmented training data and save them to disk.


In [ ]:
train_path = os.path.join(path, 'train')
val_path = os.path.join(path, 'valid')

train_image_gen = ImageDataGenerator(rotation_range=15,
                                     height_shift_range=0.05,
                                     width_shift_range=0.1,
                                     shear_range = 0.1,
                                     channel_shift_range=20,
                                    )

aug_train_batches = train_image_gen.flow_from_directory(train_path, 
                                                         target_size=(224,224), 
                                                         batch_size=batch_size,
                                                         class_mode='categorical',
                                                         shuffle=False)

train_batches = ImageDataGenerator().flow_from_directory(train_path, 
                                                          target_size=(224,224), 
                                                          batch_size=batch_size,
                                                          class_mode='categorical',
                                                          shuffle=False)

val_batches = ImageDataGenerator().flow_from_directory(val_path, 
                                                        target_size=(224,224), 
                                                        batch_size=batch_size, 
                                                        shuffle=False)

print 'Predicting, this may take a while...'
conv_model_predictions_augmented = conv_model.predict_generator(aug_train_batches,
                                                                aug_train_batches.nb_sample*2,
                                                               )
conv_model_predictions = conv_model.predict_generator(train_batches,
                                                      train_batches.nb_sample,
                                                     )
val_predictions = conv_model.predict_generator(val_batches,
                                               val_batches.nb_sample,
                                              )

print 'Done predicting!'
# Concatenating augmented and non-augmented predictions
conv_model_predictions = np.concatenate([conv_model_predictions_augmented, conv_model_predictions])

prediction_labels = to_categorical(train_batches.classes)

# The labels repeat 3 times: two augmented passes plus one plain pass over the training data
prediction_labels = np.concatenate([prediction_labels]*3)

Test convolutions


In [ ]:
test_path = os.path.join(path, 'test')
test_generator = ImageDataGenerator().flow_from_directory(test_path, 
                                                           target_size=(224,224), 
                                                           batch_size=batch_size,
                                                           class_mode='categorical',
                                                           shuffle=False)
print 'Predicting test features, this might take a while...'
conv_model_test_inputs = conv_model.predict_generator(test_generator,
                                                      test_generator.nb_sample
                                                     )
print 'Done predicting!'

In [ ]:
# save_array is defined in the "Save everything to disk" section below; run that cell first
save_array(os.path.join(model_dir, 'test_inputs.bc'), conv_model_test_inputs)

Save everything to disk

Saving everything to disk so I don't need to generate it every time


In [3]:
def save_array(location, array):
    instance = bcolz.carray(array, rootdir=location, mode='w')
    instance.flush()
    
def load_array(location):
    return bcolz.open(location)[:]

In [ ]:
save_array(os.path.join(model_dir, 'conv_predictions.bc'), conv_model_predictions)
save_array(os.path.join(model_dir, 'conv_labels.bc'), prediction_labels)
save_array(os.path.join(model_dir, 'val_predictions.bc'), val_predictions)
save_array(os.path.join(model_dir, 'val_labels.bc'), to_categorical(val_batches.classes))

Train fully connected layers only

Import data from disk


In [ ]:
conv_predictions = load_array(os.path.join(model_dir, 'conv_predictions.bc'))
conv_labels = load_array(os.path.join(model_dir, 'conv_labels.bc'))
conv_val_predictions = load_array(os.path.join(model_dir, 'val_predictions.bc'))
conv_val_labels = load_array(os.path.join(model_dir, 'val_labels.bc'))

Use data to train model


In [ ]:
dropout = 0.8
model = fc_model(dropout)
epochs = 10
lr = 0.0001
model.compile(optimizer=Adam(lr), 
              loss='categorical_crossentropy', 
              metrics=['accuracy'])

model.optimizer.lr.set_value(lr)
model.fit(conv_predictions,
          conv_labels,
          batch_size=batch_size,
          nb_epoch=epochs,
          validation_data=(conv_val_predictions, conv_val_labels))

In [ ]:
lr = 0.00001
epochs = 2
model.optimizer.lr.set_value(lr)
model.fit(conv_predictions,
          conv_labels,
          batch_size=batch_size,
          nb_epoch=epochs,
          validation_data=(conv_val_predictions, conv_val_labels))

In [ ]:
model.save_weights(os.path.join(model_dir, 'final_predictor.h5'))

Load weights from trained model, and generate predictions


In [8]:
dropout = 0.8
model = fc_model(dropout)
lr = 0.0001
model.compile(optimizer=Adam(lr), 
              loss='categorical_crossentropy', 
              metrics=['accuracy'])

model.optimizer.lr.set_value(lr)
model.load_weights(os.path.join(model_dir, 'final_predictor.h5'))

In [ ]:
test_input = load_array(os.path.join(model_dir, 'test_inputs.bc'))

In [10]:
test_predictions = model.predict(test_input)

In [14]:
test_predictions[1:3,:]


Out[14]:
array([[  4.33069140e-01,   2.62587871e-02,   8.50916728e-02,
          2.25995868e-01,   2.30034534e-02,   4.61317264e-02,
          1.71671864e-02,   1.53106423e-02,   1.07109718e-01,
          2.08618529e-02],
       [  1.30285183e-03,   5.67891548e-05,   3.76506563e-04,
          1.87800475e-03,   9.90259051e-01,   6.48322923e-04,
          2.67745648e-03,   2.02435971e-04,   8.06084485e-04,
          1.79246720e-03]], dtype=float32)

Convert to proper CSV

Now that I have the predictions, I need to put them into a proper .csv file that Kaggle will understand. That means:

  • Clipping the values so that over-confident wrong predictions don't get punished too heavily by the log-loss metric (see the short arithmetic below)
  • Figuring out the corresponding filenames
  • Adding a proper header
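
Kaggle scores this competition with multi-class log loss, i.e. -log(p) where p is the probability the submission assigns to the true class. A confidently wrong row with p = 0.001 on its true class costs -ln(0.001) ≈ 6.9 on its own, whereas clipping to [0.02, 0.98] caps the worst case at -ln(0.02) ≈ 3.9, at the small cost of correct rows scoring -ln(0.98) ≈ 0.02 instead of ~0.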

In [15]:
clipped_predictions = np.clip(test_predictions, 0.02, 0.98)

In [21]:
# Sort so the order matches the (shuffle=False) test generator, which walks the files in sorted order
filename_list = sorted(os.listdir(os.path.join(test_path, 'unknown')))

In [31]:
filename_array = np.transpose(np.array(filename_list, ndmin=2))

In [38]:
csv_headless = np.concatenate([filename_array, clipped_predictions], axis=1)

In [46]:
header_list = [
    'img',
    'c0', 
    'c1',
    'c2',
    'c3',
    'c4',
    'c5',
    'c6',
    'c7',
    'c8',
    'c9',
]
header_line = np.array(header_list, ndmin=2)

In [53]:
ans_array = np.concatenate([header_line, csv_headless])
# ans_array = ans_array.astype('|S10')

In [54]:
np.savetxt(os.path.join(data_dir, "submission.csv"), ans_array, delimiter=',', fmt='%s')
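
An alternative sketch for this last step, since the csv module is already imported at the top: write the rows directly instead of building a string array. This assumes filename_list, clipped_predictions, header_list and data_dir from the cells above; the output filename is arbitrary.

    with open(os.path.join(data_dir, 'submission_csv_module.csv'), 'wb') as f:
        writer = csv.writer(f)
        writer.writerow(header_list)
        # one row per test image: filename followed by its ten clipped class probabilities
        for filename, row in zip(filename_list, clipped_predictions):
            writer.writerow([filename] + ['%.6f' % p for p in row])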

In [55]:
data_dir


Out[55]:
'/home/ubuntu/homework/homework3/notebooks/../data'
